Library¶
import pandas as pd
import numpy as np
import PreProcessingText as ppt
from collections import Counter, defaultdict
import seaborn as sns
from wordcloud import WordCloud
import networkx as nx
import matplotlib.pyplot as plt
import squarify
from transformers import pipeline
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from keybert import KeyBERT
from umap import UMAP
import hdbscan
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import csv
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.cluster import KMeans
from scipy.spatial import distance
from scipy.cluster import hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from matplotlib.colors import ListedColormap
4° Approach: BERTopic¶
Baseline Summary¶
Clustering Approach¶
- Parameter Setting: A high `min_cluster_size` (1200) was set for HDBSCAN so that only a few well-defined clusters are produced, each covering a significant percentage of the dataset. This baseline is intended to feed machine learning algorithms for prediction purposes.
Initial Clustering Results¶
Clusters Retrieved: 7 representative clusters were identified:
- Drug sales
- Bitcoin
- Scammers and seller reviews
- Marketplace advertising
- Purchase reviews
- Drug purchases
- Orders
Outliers: Initially, about 34k of the roughly 66.7k records were labelled as outliers (topic -1).
Performance Metrics:
- Silhouette Score: 0.64
- Davies-Bouldin Score: 0.55 (lower is better)
Outlier Reduction¶
Cosine Measure on Embeddings: By applying a cosine similarity measure with a 0.53 threshold, the number of outliers was reduced from 34k to 27k, reintroducing about 7k records.
Updated Performance Metrics:
- Silhouette Score: 0.51
- Davies-Bouldin Score: 0.76 (lower is better)
Trade-off Analysis¶
Outlier Reintroduction: Reassigning only the outliers that pass the similarity threshold strikes a balance: more records are retained without significant cluster degradation, and the clusters remain well separated and well defined, as the graphs show.
Cluster Distribution: The updated clusters are well-distributed:
- Maximum cluster size: 23% of the clustered (non-outlier) records
- Minimum cluster size: 7% of the clustered (non-outlier) records
- No single cluster dominates, so the distribution avoids large imbalances between clusters.
Data Loss and Potential Adjustments¶
Data Loss: Approximately 40% of the initial dataset (the roughly 27.6k records still labelled as outliers) is excluded from the final clusters.
Potential Correction: This data loss can potentially be mitigated by lowering the cosine similarity threshold between embeddings.
# Load the cleaned thread titles, discard rows without a title and keep only
# the first occurrence of every distinct title.
df = (
    pd.read_csv('cleaned_data_name_thread.csv')
    .dropna(subset=['name_thread'])
    .drop_duplicates(subset=['name_thread'], keep='first')
)
df.shape[0]
66735
# Multilingual sentence encoder used to embed every thread title.
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')
# Project helper built from the dataframe and its 'name_thread' column; the
# rest of the notebook reads `tc1.corpus` and `tc1.corpus_embeddings` from it.
tc1 = ppt.TextClustering(df, 'name_thread')
# Presumably fills `tc1.corpus_embeddings` with non-tensor (array) embeddings,
# given to_tensor=False -- TODO confirm against PreProcessingText.
tc1.encode_corpus(model, batch_size=64, to_tensor=False)
# Sanity check: documents and embeddings should have the same length.
len(tc1.corpus), len(tc1.corpus_embeddings)
# --- Baseline run: UMAP (10 components) + HDBSCAN with min_cluster_size=1200 ---

# Dimensionality reduction and clustering back-ends.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=1200,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Topic-representation components.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = MaximalMarginalRelevance(diversity=0.3)

# A single seed group of marketplace / drug / fraud vocabulary.
seed_topic_list = [[
    'tor site', 'drug', 'cocaine', 'ketamine', 'weed',
    'trafficking', 'scammer', 'market', 'vendor', 'bitcoin',
    'mdma', 'coke', 'lsd', 'heroine', 'xanax',
    'tor node', 'tor site', 'gun', 'weapon', 'hacking',
]]
# NOTE(review): the intent names are wrapped in an outer list here, mirroring
# seed_topic_list; BERTopic's examples pass zeroshot_topic_list as a flat list
# of strings -- confirm which shape is intended.
zeroshot_topic_list = [pd.read_csv('../../../intent_crime.csv')['intent'].tolist()]

# NOTE(review): no embedding_model is passed while embeddings are precomputed;
# confirm that the seed and zero-shot lists actually take effect in this setup.
topic_model = BERTopic(
    language='multilingual',
    top_n_words=10,
    n_gram_range=(1, 2),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    seed_topic_list=seed_topic_list,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=.05,
    verbose=True,
)

# Fit on the precomputed embeddings; `topics` holds one topic id per document.
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)

# Summary table followed by the top words of every topic id that was assigned.
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
Topic Count Name \
0 -1 34449 -1_new_free_ticket_help
1 0 7495 0_weed_xanax_cocaine_coke
2 1 6093 1_market_dream_empire_nightmare
3 2 5034 2_vendor_scammer_scam_scamming
4 3 4087 3_review_vendor_feedback_mdma
5 4 4003 4_mdma_lsd_shit_whats
6 5 2402 5_order_package_delivery_shipping
7 6 1966 6_bitcoin_card_wallet_credit
Representation \
0 [new, free, ticket, help, update, account, mdm...
1 [weed, xanax, cocaine, coke, ketamine, mg, can...
2 [market, dream, empire, nightmare, vendor, wal...
3 [vendor, scammer, scam, scamming, exit, scamme...
4 [review, vendor, feedback, mdma, mg, sample, r...
5 [mdma, lsd, shit, whats, fuck, gone, got, guy,...
6 [order, package, delivery, shipping, tracking,...
7 [bitcoin, card, wallet, credit, coin, carding,...
Representative_Docs
0 [canadianflavor weed shatter cbd edible hash c...
1 [high quality weed thc product europe, new xan...
2 [next market, dream market vendor rstclass nig...
3 [looking good reliable vendor sell ounce, vend...
4 [empire vendor cocaine review, first ever revi...
5 [hey ro im gon na pull pk, life wonderful life...
6 [order accepted day still hasnt marked shipped...
7 [credit cards paypal prepaid card find, got cc...
Topic 0:
[('weed', 0.5972313505812425), ('xanax', 0.5664832282989213), ('cocaine', 0.5350787342936356), ('coke', 0.4710111701375004), ('ketamine', 0.46985128023380035), ('mg', 0.46256209204548415), ('cannabis', 0.41853925594172725), ('drug', 0.4053330171594432), ('pill', 0.3907822559981816), ('quality', 0.38621568363790615)]
Topic 1:
[('market', 0.892430998800942), ('dream', 0.6865843677324943), ('empire', 0.6830028029033173), ('nightmare', 0.5681939396872522), ('vendor', 0.34305231363817884), ('wall', 0.3245499595042113), ('marketplace', 0.319921898437173), ('scam', 0.2961241301762431), ('exit', 0.2960733863924834), ('link', 0.2915460778160393)]
Topic 2:
[('vendor', 0.6950361459297074), ('scammer', 0.6725026815231682), ('scam', 0.4980623980369779), ('scamming', 0.46575246018365657), ('exit', 0.44160475610894967), ('scammed', 0.40051759892624533), ('looking', 0.37884048200047027), ('warning', 0.37715463753082534), ('reliable', 0.37144259341974245), ('buyer', 0.3708904841304073)]
Topic 3:
[('review', 1.002255217202406), ('vendor', 0.5076272530565451), ('feedback', 0.4049037794348937), ('mdma', 0.381329954044546), ('mg', 0.37619091451980585), ('sample', 0.3754397070467268), ('reviews', 0.3504300951320543), ('lsd', 0.3465899767001684), ('opinion', 0.3303160657068881), ('xanax', 0.33022254366369147)]
Topic 4:
[('mdma', 0.38275973612659386), ('lsd', 0.3779572278615291), ('shit', 0.35340590919386444), ('whats', 0.34834774258692336), ('fuck', 0.3264035078860319), ('gone', 0.31797094824590016), ('got', 0.3167851762249627), ('guy', 0.3153758862961693), ('dead', 0.31361936874635366), ('going', 0.3042237209259171)]
Topic 5:
[('order', 0.9350712100343167), ('package', 0.6655706541276237), ('delivery', 0.562721266995139), ('shipping', 0.527231820138037), ('tracking', 0.5122872117651205), ('shipped', 0.48839280205239965), ('ordering', 0.4784769909883374), ('cancelled', 0.47119974969542505), ('pack', 0.4566507281813944), ('delivered', 0.45351148583756845)]
Topic 6:
[('bitcoin', 0.8235475804294793), ('card', 0.7734286502423073), ('wallet', 0.6772588642347616), ('credit', 0.6731588060336892), ('coin', 0.5703668040987371), ('carding', 0.5529443276986676), ('btc', 0.5121844608207589), ('cash', 0.5037356917020909), ('debit', 0.500260454896595), ('coinbase', 0.49454000630077194)]
Topic -1:
[('new', 0.28398750337326484), ('free', 0.2771677713524054), ('ticket', 0.2699448449851029), ('help', 0.2697705189262906), ('update', 0.2675394807401724), ('account', 0.26547262677161937), ('mdma', 0.2638718211547908), ('vendor', 0.2588459510247759), ('dispute', 0.25440435619535773), ('need', 0.2488688355528112)]
# Project the sentence embeddings with the topic model's UMAP instance.
# NOTE(review): fit_transform re-fits the UMAP object stored inside the topic
# model; confirm that re-fitting (rather than reusing the fitted projection)
# is intended.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

# Positions and topic ids of the documents that belong to a real topic
# (topic id -1 is excluded from the metrics).
indices = [pos for pos, topic in enumerate(topics) if topic != -1]
labels = [topics[pos] for pos in indices]
X = umap_embeddings[np.array(indices)]

# Cluster quality measured in the reduced space, outliers excluded.
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6388389468193054 Davies_bouldin_score: 0.5523262827209047
# Scatter of the first two UMAP dimensions of every non-outlier document,
# coloured by topic id.
#
# The former `best_indices` / `best_umap_embeddings` lines were removed:
# `silhouette_scores` is the single mean coefficient returned by
# silhouette_score(), so np.argsort() on it always yielded [0] and the derived
# values were never used by the plot.
# NOTE(review): the title still mentions "highest silhouette scores" although
# all non-outlier documents are drawn; left unchanged to keep the figure text.
plt.figure(figsize=(10, 5))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=5)
plt.gca().set_aspect('equal', 'datalim')
plt.colorbar()
plt.title('UMAP projection of the topics with highest silhouette scores', fontsize=24)
plt.show()
# Recompute the topic words with 1- to 5-gram terms (English stop words removed).
# NOTE(review): only the vectorizer is passed, so the c-TF-IDF and
# representation settings used at fit time may be replaced by defaults here --
# confirm this is intended.
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 5),
)
topic_model.update_topics(
    tc1.corpus,
    vectorizer_model=vectorizer_model,
)
topic_model.get_topic_freq()
| Topic | Count | |
|---|---|---|
| 0 | -1 | 34449 |
| 5 | 0 | 7495 |
| 2 | 1 | 6093 |
| 1 | 2 | 5034 |
| 7 | 3 | 4087 |
| 6 | 4 | 4003 |
| 3 | 5 | 2402 |
| 4 | 6 | 1966 |
# Show the refreshed summary table, then the top words of each assigned topic.
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
Topic Count Name \
0 -1 34449 -1_vendor_new_free_help
1 0 7495 0_weed_vendor_xanax_mg
2 1 6093 1_market_empire_dream_vendor
3 2 5034 2_vendor_scammer_scam_looking
4 3 4087 3_review_vendor review_vendor_review vendor
5 4 4003 4_mdma_lsd_good_got
6 5 2402 5_order_package_shipping_delivery
7 6 1966 6_card_bitcoin_wallet_credit
Representation \
0 [vendor, new, free, help, best, account, uk, u...
1 [weed, vendor, xanax, mg, cocaine, uk, best, c...
2 [market, empire, dream, vendor, nightmare, dre...
3 [vendor, scammer, scam, looking, scamming, exi...
4 [review, vendor review, vendor, review vendor,...
5 [mdma, lsd, good, got, shit, whats, guy, fuck,...
6 [order, package, shipping, delivery, vendor, p...
7 [card, bitcoin, wallet, credit, btc, carding, ...
Representative_Docs
0 [canadianflavor weed shatter cbd edible hash c...
1 [high quality weed thc product europe, new xan...
2 [next market, dream market vendor rstclass nig...
3 [looking good reliable vendor sell ounce, vend...
4 [empire vendor cocaine review, first ever revi...
5 [hey ro im gon na pull pk, life wonderful life...
6 [order accepted day still hasnt marked shipped...
7 [credit cards paypal prepaid card find, got cc...
Topic 0:
[('weed', 0.02425497350614531), ('vendor', 0.021978341010015688), ('xanax', 0.02077949072716719), ('mg', 0.01948517638840499), ('cocaine', 0.018417804414484252), ('uk', 0.015046793957699879), ('best', 0.013425752943917355), ('coke', 0.012717130457267087), ('ketamine', 0.01175969464362258), ('cannabis', 0.010948216683877144)]
Topic 1:
[('market', 0.09008978566905657), ('empire', 0.055274112551010335), ('dream', 0.04917325935832957), ('vendor', 0.024276714575283735), ('nightmare', 0.023605168431774765), ('dream market', 0.016025449931173885), ('empire market', 0.014646720705699409), ('new', 0.009033909010090109), ('nightmare market', 0.008867402221856543), ('scam', 0.006303868464254871)]
Topic 2:
[('vendor', 0.09965429794348642), ('scammer', 0.025788920958809015), ('scam', 0.017833603310448354), ('looking', 0.01337570071081538), ('scamming', 0.012208815488636926), ('exit', 0.011806364340026236), ('scammed', 0.008689720115543394), ('uk', 0.008678133768927804), ('good', 0.008493482524539575), ('warning', 0.008418582129949287)]
Topic 3:
[('review', 0.1428141634073404), ('vendor review', 0.058876246025626515), ('vendor', 0.05315846344525214), ('review vendor', 0.021049951157661017), ('review vendor review', 0.017406474951027713), ('review review', 0.015138695407876355), ('mg', 0.012888546716744416), ('mdma', 0.011146461993445255), ('sample', 0.010133356066428198), ('dream', 0.009783289767907996)]
Topic 4:
[('mdma', 0.011231558108969678), ('lsd', 0.009238251834183116), ('good', 0.007359917621616781), ('got', 0.006638868206622288), ('shit', 0.0065802885463340675), ('whats', 0.006051630264178851), ('guy', 0.005697866126116449), ('fuck', 0.005394916465354471), ('going', 0.005375411718474036), ('wsm', 0.0052967375805114646)]
Topic 5:
[('order', 0.09533424569336707), ('package', 0.025076372096897597), ('shipping', 0.02284913659637588), ('delivery', 0.018139605364174704), ('vendor', 0.014195026757439324), ('pack', 0.014024930561711633), ('tracking', 0.012976075064416448), ('shipped', 0.012741042718045418), ('ordering', 0.01153929794529684), ('time', 0.01087192180365464)]
Topic 6:
[('card', 0.04045581193563761), ('bitcoin', 0.03526436871145481), ('wallet', 0.02671909128748556), ('credit', 0.02286661027552805), ('btc', 0.0196385675748142), ('carding', 0.018970779081355412), ('coin', 0.016677548495845462), ('credit card', 0.014601870612078016), ('cash', 0.012420616388040553), ('bank', 0.010979756425111214)]
Topic -1:
[('vendor', 0.013820616851140987), ('new', 0.009152016420677532), ('free', 0.006913858221511509), ('help', 0.006453408973195096), ('best', 0.0060032500179123234), ('account', 0.005801364375676093), ('uk', 0.005664162822486113), ('update', 0.005547486073465391), ('crosspost', 0.005503646525948444), ('need', 0.00541678801673178)]
# Built-in BERTopic overviews of the fitted topics.
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()

# Separate 2-D projection used only for the document scatter plot.
# NOTE(review): no random_state is set here, so this layout will differ
# between runs.
reduced_embeddings = UMAP(n_neighbors=15, n_components=2,
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
                                hide_document_hover=True, hide_annotations=True)
topic_model.visualize_barchart()

# Reassign outliers whose embedding similarity to a topic passes the threshold.
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings",
                                         embeddings=tc1.corpus_embeddings, threshold=0.53)

# Rebuild the topic representations for the new assignment. The vectorizer is
# passed explicitly: without it the stop-word filtering configured above is not
# applied and filler words such as "anyone", "get" and "please" end up in the
# topic names.
topic_model.update_topics(tc1.corpus, topics=new_topics, vectorizer_model=vectorizer_model)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 27613 | -1_anyone_new_help_free | [anyone, new, help, free, please, update, tick... | [canadianflavor weed shatter cbd edible hash c... |
| 1 | 0 | 8645 | 0_weed_xanax_vendor_cocaine | [weed, xanax, vendor, cocaine, mg, uk, coke, b... | [high quality weed thc product europe, new xan... |
| 2 | 1 | 6236 | 1_market_empire_dream_nightmare | [market, empire, dream, nightmare, vendor, dre... | [next market, dream market vendor rstclass nig... |
| 3 | 2 | 6907 | 2_vendor_scammer_scam_looking | [vendor, scammer, scam, looking, scamming, sal... | [looking good reliable vendor sell ounce, vend... |
| 4 | 3 | 4230 | 3_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [empire vendor cocaine review, first ever revi... |
| 5 | 4 | 6299 | 4_mdma_lsd_get_looking | [mdma, lsd, get, looking, wsm, good, btc, ques... | [hey ro im gon na pull pk, life wonderful life... |
| 6 | 5 | 2776 | 5_order_package_shipping_delivery | [order, package, shipping, delivery, pack, shi... | [order accepted day still hasnt marked shipped... |
| 7 | 6 | 2823 | 6_bitcoin_card_wallet_btc | [bitcoin, card, wallet, btc, bank, credit, car... | [credit cards paypal prepaid card find, got cc... |
# Same visual overviews as before, now reflecting the reduced-outlier topics.
topic_model.visualize_documents(
    tc1.corpus,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True,
    hide_annotations=True,
)
topic_model.visualize_hierarchy()
topic_model.visualize_topics()
topic_model.visualize_barchart()

# Recompute the cluster metrics against the updated assignment `new_topics`.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

# Keep only documents that are still assigned to a real topic (id != -1).
indices = [pos for pos, topic in enumerate(new_topics) if topic != -1]
labels = [new_topics[pos] for pos in indices]
X = umap_embeddings[np.array(indices)]

silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5083789229393005 Davies_bouldin_score: 0.7570962651091117
# Normalise the titles in place and collect one timestamp per remaining row.
# NOTE(review): `created_on` must line up one-to-one with `tc1.corpus`;
# lower-casing before de-duplicating can drop extra rows if titles differ only
# by case -- compare len(created_on) with len(tc1.corpus).
df['name_thread'] = df['name_thread'].str.lower()
df.dropna(subset=['name_thread'], inplace=True)
df.drop_duplicates(subset='name_thread', inplace=True)
created_on = df['created_on'].tolist()
len(created_on)

# Topic frequencies across 100 time bins, then the interactive chart.
topics_over_time = topic_model.topics_over_time(
    tc1.corpus,
    created_on,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=100,
)
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)
# Build one row per non-outlier document and attach the topic metadata.
indices = [pos for pos, topic in enumerate(new_topics) if topic != -1]


def _take(sequence):
    """Return the items of `sequence` at the non-outlier positions."""
    return [sequence[pos] for pos in indices]


corpus_valid = _take(tc1.corpus)
created_on_valid = _take(created_on)
embeddings_valid = _take(tc1.corpus_embeddings)
topics_valid = _take(new_topics)
# `probs` still comes from the original fit, so documents re-assigned by the
# outlier reduction can carry a probability of 0.
probs_valid = _take(probs)

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})

# Add Count / Name / Representation / Representative_Docs for every topic.
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
# `X` holds the reduced embeddings of the same non-outlier documents, computed
# with the metrics; it is attached positionally, so row order must match.
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(37916, 10)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | Representation | Representative_Docs | UMAP_embedding | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | checks | [0.052164897, 0.029597273, -0.03666609, 0.0051... | 4 | 0.000000 | 2020-01-09 | 6299 | 4_mdma_lsd_get_looking | [mdma, lsd, get, looking, wsm, good, btc, ques... | [hey ro im gon na pull pk, life wonderful life... | [1.6488198, 9.914265, 1.442794, 2.8094368, -0.... |
| 1 | trusted vendor status | [0.02445144, -0.008732641, -0.0050215074, 0.01... | 2 | 0.944247 | 2020-01-09 | 6907 | 2_vendor_scammer_scam_looking | [vendor, scammer, scam, looking, scamming, sal... | [looking good reliable vendor sell ounce, vend... | [2.910516, 10.281041, 1.650234, 3.0320778, -0.... |
| 2 | empire exit scam iiflux user incomming | [0.02890829, 0.036081452, -0.027694924, -0.007... | 1 | 1.000000 | 2019-11-06 | 6236 | 1_market_empire_dream_nightmare | [market, empire, dream, nightmare, vendor, dre... | [next market, dream market vendor rstclass nig... | [1.5884036, 9.8587885, 3.3090453, 2.652358, 2.... |
| 3 | ecstasy vendor packs | [-0.022524439, 0.03949761, -0.023750877, 0.033... | 5 | 0.797741 | 2020-01-09 | 2776 | 5_order_package_shipping_delivery | [order, package, shipping, delivery, pack, shi... | [order accepted day still hasnt marked shipped... | [2.0245404, 10.517631, 2.3443217, 3.7595236, -... |
| 4 | opening bank account person fake id | [-0.029834118, 0.03354508, -0.012210185, -0.02... | 6 | 1.000000 | 2019-11-06 | 2823 | 6_bitcoin_card_wallet_btc | [bitcoin, card, wallet, btc, bank, credit, car... | [credit cards paypal prepaid card find, got cc... | [0.7278271, 9.884823, 1.8116106, 2.9336705, -0... |
# Persist the fitted model and the per-document results.
# NOTE(review): pickle serialization -- only load this file from trusted sources.
topic_model.save("Models/topic_model_0.64SilNew", serialization='pickle')
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_1200cluster_0.64sil_renewout.parquet')

# Distribution of documents over topics: histogram, then pie chart.
sns.histplot(results_final, x='Topic', discrete=True);
topic_counts = results_final.value_counts('Topic')
plt.pie(topic_counts, labels=topic_counts.index, autopct='%1.1f%%');
500 min cluster size¶
# --- Second run: same pipeline, HDBSCAN min_cluster_size lowered to 500 ---

# Dimensionality reduction and clustering back-ends.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=500,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Topic-representation components.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
representation_model = MaximalMarginalRelevance(diversity=0.3)

# A single seed group of marketplace / drug / fraud vocabulary.
seed_topic_list = [[
    'tor site', 'drug', 'cocaine', 'ketamine', 'weed',
    'trafficking', 'scammer', 'market', 'vendor', 'bitcoin',
    'mdma', 'coke', 'lsd', 'heroine', 'xanax',
    'tor node', 'tor site', 'gun', 'weapon', 'hacking',
]]
# NOTE(review): the intent names are wrapped in an outer list; BERTopic's
# examples pass zeroshot_topic_list as a flat list of strings -- confirm.
zeroshot_topic_list = [pd.read_csv('../../../intent_crime.csv')['intent'].tolist()]

# NOTE(review): no embedding_model is passed while embeddings are precomputed;
# confirm that the seed and zero-shot lists actually take effect in this setup.
topic_model = BERTopic(
    language='multilingual',
    top_n_words=10,
    n_gram_range=(1, 2),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    seed_topic_list=seed_topic_list,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=.05,
    verbose=True,
)

# Fit on the precomputed embeddings and show the topic summary.
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 28000 | -1_mdma_new_link_lsd | [mdma, new, link, lsd, help, free, vendor, nee... | [need high quality fake id check , big thanks ... |
| 1 | 0 | 4930 | 0_xanax_coke_cocaine_ketamine | [xanax, coke, cocaine, ketamine, mg, drug, pil... | [promo sale mg adderall ad xanax mg lsd mdma u... |
| 2 | 1 | 4469 | 1_bitcoin_card_bank_carding | [bitcoin, card, bank, carding, monero, wallet,... | [way cash bank log using btc, send bitcoin get... |
| 3 | 2 | 4227 | 2_dread_sub_lsd_shit | [dread, sub, lsd, shit, mdma, whats, guy, fuck... | [hey guy xangod man, let guy know dread host w... |
| 4 | 3 | 3702 | 3_market_dream_nightmare_dreammarket | [market, dream, nightmare, dreammarket, market... | [not order nightmare market, nightmare market ... |
| 5 | 4 | 3469 | 4_review_vendor_reviews_mg | [review, vendor, reviews, mg, vendymcvendface,... | [thclear ml purple kush vape cart review, vend... |
| 6 | 5 | 3410 | 5_order_package_pack_dispute | [order, package, pack, dispute, delivery, ship... | [package custom month love letter nothing, pac... |
| 7 | 6 | 2700 | 6_vendor_looking_seller_vendors | [vendor, looking, seller, vendors, buyer, lsd,... | [best vendor uk lsd, looking good vendor cc fu... |
| 8 | 7 | 1694 | 7_weed_cannabis_marijuana_hash | [weed, cannabis, marijuana, hash, quality, str... | [hash weed ship eu good vendor also usa, new i... |
| 9 | 8 | 1540 | 8_darknet_dark_web_sentenced | [darknet, dark, web, sentenced, drug, darkweb,... | [tacoma man sentenced four year dealing drugs ... |
| 10 | 9 | 1502 | 9_empire_dispute_deposit_empiremarket | [empire, dispute, deposit, empiremarket, scamm... | [empire next, give me empire, empire anyone else] |
| 11 | 10 | 1475 | 10_account_password_pgp_hacking | [account, password, pgp, hacking, hacked, secu... | [vendor enerygcontrolled hacked ca nt log pass... |
| 12 | 11 | 1314 | 11_tried_anybody_heard_ordered | [tried, anybody, heard, ordered, used, recentl... | [anybody heard pasitheas, anyone order recentl... |
| 13 | 12 | 1031 | 12_scammer_scam_exit_scamming | [scammer, scam, exit, scamming, warning, scamm... | [xangod scammer going exit scam proof, cottage... |
| 14 | 13 | 777 | 13_update_maintenance_updated_upgrade | [update, maintenance, updated, upgrade, vender... | [shipping update, update order, vendor update] |
| 15 | 14 | 681 | 14_ticket_support_deposit_month | [ticket, support, deposit, month, response, an... | [support ticket ticket, please help support ti... |
| 16 | 15 | 608 | 15_sample_samples_free_test | [sample, samples, free, test, testing, lab, te... | [xanax mg shipping free samples, new vendor fr... |
# Project the sentence embeddings with the topic model's UMAP instance.
# NOTE(review): fit_transform re-fits the UMAP object stored inside the topic
# model; confirm that re-fitting is intended.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

# Positions and topic ids of the documents that belong to a real topic
# (topic id -1 is excluded from the metrics).
indices = [pos for pos, topic in enumerate(topics) if topic != -1]
labels = [topics[pos] for pos in indices]
X = umap_embeddings[np.array(indices)]

silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5718363523483276 Davies_bouldin_score: 0.6211900149809264
# Scatter of UMAP dimensions 1 and 2 of every non-outlier document, coloured
# by topic id, with one legend entry per topic.
#
# The former `best_indices` / `best_umap_embeddings` lines were removed:
# `silhouette_scores` is the single mean coefficient returned by
# silhouette_score(), so np.argsort() on it always yielded [0] and the derived
# values were never used by the plot.
# NOTE(review): the title still mentions "highest silhouette scores" although
# all non-outlier documents are drawn; left unchanged to keep the figure text.
unique_labels = np.unique(labels)
cmap = plt.cm.magma

plt.figure(figsize=(10, 5))
scatter = plt.scatter(X[:, 1], X[:, 2], c=labels, cmap=cmap, s=5)
plt.gca().set_aspect('equal', 'datalim')

# Legend markers use the same colour mapping as the scatter points.
norm = plt.Normalize(vmin=min(labels), vmax=max(labels))
handles = [
    plt.Line2D([0], [0], marker='o', color=cmap(norm(label)), linestyle='', markersize=10)
    for label in unique_labels
]
legend_labels = [f'Class {label}' for label in unique_labels]
plt.legend(handles, legend_labels, title="Classes")
plt.colorbar(scatter, ticks=range(len(unique_labels)))
plt.title('UMAP projection of the topics with highest silhouette scores', fontsize=24)
plt.show()
# Recompute the topic words with 1- to 5-gram terms (English stop words removed).
# NOTE(review): only the vectorizer is passed, so the c-TF-IDF and
# representation settings used at fit time may be replaced by defaults here --
# confirm this is intended.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)

# Built-in BERTopic overviews of the fitted topics.
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()

# Separate 2-D projection used only for the document scatter plot.
# NOTE(review): no random_state is set here, so this layout will differ
# between runs.
reduced_embeddings = UMAP(n_neighbors=15, n_components=2,
                          min_dist=0.0, metric='cosine').fit_transform(tc1.corpus_embeddings)
topic_model.visualize_documents(tc1.corpus, reduced_embeddings=reduced_embeddings,
                                hide_document_hover=True, hide_annotations=True)

# Reassign outliers whose embedding similarity to a topic passes the threshold.
new_topics = topic_model.reduce_outliers(tc1.corpus, topics, strategy="embeddings",
                                         embeddings=tc1.corpus_embeddings, threshold=0.6)

# Rebuild the topic representations for the new assignment. The vectorizer is
# passed explicitly: without it the stop-word filtering configured above is not
# applied and topics degrade to filler terms such as "anyone", "has" and
# "has anyone".
topic_model.update_topics(tc1.corpus, topics=new_topics, vectorizer_model=vectorizer_model)
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 23928 | -1_new_vendor_help_uk | [new, vendor, help, uk, need, mdma, best, free... | [need high quality fake id check , big thanks ... |
| 1 | 0 | 5207 | 0_xanax_cocaine_mg_coke | [xanax, cocaine, mg, coke, ketamine, vendor, p... | [promo sale mg adderall ad xanax mg lsd mdma u... |
| 2 | 1 | 4512 | 1_bitcoin_card_bank_carding | [bitcoin, card, bank, carding, monero, wallet,... | [way cash bank log using btc, send bitcoin get... |
| 3 | 2 | 4944 | 2_dread_mdma_lsd_get | [dread, mdma, lsd, get, sub, shit, guy, lookin... | [hey guy xangod man, let guy know dread host w... |
| 4 | 3 | 3801 | 3_market_dream_nightmare_dream market | [market, dream, nightmare, dream market, vendo... | [not order nightmare market, nightmare market ... |
| 5 | 4 | 3706 | 4_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [thclear ml purple kush vape cart review, vend... |
| 6 | 5 | 3434 | 5_order_dispute_pack_package | [order, dispute, pack, package, shipping, deli... | [package custom month love letter nothing, pac... |
| 7 | 6 | 4123 | 6_vendor_vendor vendor_looking_best | [vendor, vendor vendor, looking, best, inquiry... | [best vendor uk lsd, looking good vendor cc fu... |
| 8 | 7 | 1848 | 7_weed_cannabis_uk_weed vendor | [weed, cannabis, uk, weed vendor, vendor, qual... | [hash weed ship eu good vendor also usa, new i... |
| 9 | 8 | 1557 | 8_darknet_dark_dark web_web | [darknet, dark, dark web, web, drug, sentenced... | [tacoma man sentenced four year dealing drugs ... |
| 10 | 9 | 1835 | 9_empire_empire market_market_empire empire | [empire, empire market, market, empire empire,... | [empire next, give me empire, empire anyone else] |
| 11 | 10 | 1542 | 10_account_pgp_password_vendor account | [account, pgp, password, vendor account, crypt... | [vendor enerygcontrolled hacked ca nt log pass... |
| 12 | 11 | 1394 | 11_anyone_has_has anyone_anybody | [anyone, has, has anyone, anybody, tried, anyo... | [anybody heard pasitheas, anyone order recentl... |
| 13 | 12 | 1398 | 12_scammer_scam_exit_scamming | [scammer, scam, exit, scamming, scammed, warni... | [xangod scammer going exit scam proof, cottage... |
| 14 | 13 | 826 | 13_update_maintenance_updated_update update | [update, maintenance, updated, update update, ... | [shipping update, update order, vendor update] |
| 15 | 14 | 682 | 14_ticket_support ticket_support_please | [ticket, support ticket, support, please, depo... | [support ticket ticket, please help support ti... |
| 16 | 15 | 792 | 15_sample_free_free sample_samples | [sample, free, free sample, samples, free samp... | [xanax mg shipping free samples, new vendor fr... |
# Visual overviews reflecting the reduced-outlier topics.
topic_model.visualize_documents(
    tc1.corpus,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True,
    hide_annotations=True,
)
topic_model.visualize_topics()
topic_model.visualize_hierarchy()
topic_model.visualize_barchart(top_n_topics=16)

# Recompute the cluster metrics against the updated assignment `new_topics`.
umap_embeddings = topic_model.umap_model.fit_transform(tc1.corpus_embeddings)

# Keep only documents that are still assigned to a real topic (id != -1).
indices = [pos for pos, topic in enumerate(new_topics) if topic != -1]
labels = [new_topics[pos] for pos in indices]
X = umap_embeddings[np.array(indices)]

silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.49986162781715393 Davies_bouldin_score: 0.7193546666619981
# Build one row per non-outlier document and attach the topic metadata.
indices = [pos for pos, topic in enumerate(new_topics) if topic != -1]


def _take(sequence):
    """Return the items of `sequence` at the non-outlier positions."""
    return [sequence[pos] for pos in indices]


corpus_valid = _take(tc1.corpus)
created_on_valid = _take(created_on)
embeddings_valid = _take(tc1.corpus_embeddings)
topics_valid = _take(new_topics)
# `probs` still comes from the original fit, not from the outlier reduction.
probs_valid = _take(probs)

results = pd.DataFrame({
    'Document': corpus_valid,
    'Embedding': embeddings_valid,
    'Topic': topics_valid,
    'Probability': probs_valid,
    'Created_on': created_on_valid,
})

# Add the per-topic columns from get_topic_info() to every document row.
results_final = pd.merge(results, topic_model.get_topic_info(), on='Topic')
# `X` holds the reduced embeddings of the same non-outlier documents, computed
# with the metrics; it is attached positionally, so row order must match.
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
# NOTE(review): the file name says "0.54sil", while the scores printed for this
# run are about 0.57 before and 0.50 after outlier reduction -- confirm naming.
results_final.to_parquet('ResultsBERTopic/BERTopic_nodefinedcluster_topics_15n_10com_500cluster_0.54sil_renewout.parquet')
(41601, 10)
# Topic frequencies across 100 time bins, then the interactive chart.
topics_over_time = topic_model.topics_over_time(
    tc1.corpus,
    created_on,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=100,
)
topic_model.visualize_topics_over_time(topics_over_time, width=1250, height=700)

# Distribution of documents over topics: pie chart, then histogram.
topic_counts = results_final.value_counts('Topic')
plt.pie(topic_counts, labels=topic_counts.index, autopct='%1.1f%%');
sns.histplot(results_final, x='Topic', discrete=True);

# NOTE(review): the file name ends in "300" while this run uses
# min_cluster_size=500 -- confirm naming. Pickle serialization: only load this
# file from trusted sources.
topic_model.save("Models/topic_model_0.50Sil300", serialization='pickle')
400 all-MiniLM-L6-v2¶
# Reload the cleaned thread titles from disk, discard rows without a title and
# keep only the first occurrence of every distinct title.
df = (
    pd.read_csv('cleaned_data_name_thread.csv')
    .dropna(subset=['name_thread'])
    .drop_duplicates(subset=['name_thread'], keep='first')
)
df.shape[0]
66735
# --- Third run: all-MiniLM-L6-v2 embeddings, HDBSCAN min_cluster_size=400 ---

# Embed the thread titles with the MiniLM sentence encoder.
model = SentenceTransformer('all-MiniLM-L6-v2')
tc1 = ppt.TextClustering(df, 'name_thread')
tc1.encode_corpus(model, batch_size=64, to_tensor=False)

# Dimensionality reduction and clustering back-ends.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=400,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Topic-representation components; the two representation models are handed
# to BERTopic together as a list, in this order.
vectorizer_model = CountVectorizer(stop_words="english")
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
mmr = MaximalMarginalRelevance(diversity=0.3)
kw = KeyBERTInspired()

# Unlike the earlier runs, the sentence encoder is passed to the topic model.
topic_model = BERTopic(
    top_n_words=10,
    n_gram_range=(1, 2),
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=[mmr, kw],
    embedding_model=model,
    verbose=True,
)

# Fit on the precomputed embeddings; `topics` holds one topic id per document.
topics, probs = topic_model.fit_transform(tc1.corpus, tc1.corpus_embeddings)

# Summary table followed by the top words of every topic id that was assigned.
print(topic_model.get_topic_info())
for topic_id in set(topics):
    print(f"Topic {topic_id}:")
    print(topic_model.get_topic(topic_id))
Topic Count Name \
0 -1 30941 -1_customer_buy_sale_buyer
1 0 5117 0_vape_shatter_carts_cartridge
2 1 2643 1_login_password_logged_error
3 2 2579 2_coca_opium_cocain_cocacolacompany
4 3 2124 3_xanaxlabs_xanaxlife_xanax_xanaxusa
5 4 1938 4_postal_usps_delivery_postage
6 5 1842 5_darkweb_darknetlive_darknetmarkets_sentenced
7 6 1721 6_empire_empiremarket_empireteam_empiredealer
8 7 1631 7_mdma_mdmamaster_pill_ecstasydata
9 8 1601 8_giftcard_card_giftcards_mastercard
10 9 1502 9_vendor_vendorpro_vendors_vendorbbmc
11 10 1417 10_scamming_scammer_scam_scammers
12 11 1126 11_counterfeiting_passport_counterfeit_fakeid
13 12 1072 12_dreammarket_nightmaremarket_market_dreams
14 13 979 13_lsd_tab_tabs_shrooms
15 14 739 14_monero_coinbase_coin_coins
16 15 676 15_review_reviewing_reviews_reviewer
17 16 674 16_pickledrick_heard_theoutfit_muttznutz
18 17 669 17_market_markets_marketplace_marketing
19 18 626 18_crosspost_deposting_goingpostal_vendors
20 19 603 19_deposit_depositing_deposits_ticket
21 20 573 20_pgpkey_pgp_pgps_pg
22 21 535 21_mod_moderator_dispute_disputes
23 22 450 22_cryptonia_cryptoniausers_cryptonians_cryptn...
24 23 445 23_wsm_wsms_vendorcp_machinerymint
25 24 443 24_ketamine_ketamin_ketamineking_ketaminekings
26 25 434 25_ticket_ticketmaster_ticketw_tickets
27 26 429 26_meth_methbusters_methamphetamine_crystal
Representation \
0 [customer, buy, sale, buyer, service, message,...
1 [vape, shatter, carts, cartridge, ounce, marij...
2 [login, password, logged, error, problem, log,...
3 [coca, opium, cocain, cocacolacompany, coke, c...
4 [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr...
5 [postal, usps, delivery, postage, mail, delive...
6 [darkweb, darknetlive, darknetmarkets, sentenc...
7 [empire, empiremarket, empireteam, empiredeale...
8 [mdma, mdmamaster, pill, ecstasydata, mdmaus, ...
9 [giftcard, card, giftcards, mastercard, cards,...
10 [vendor, vendorpro, vendors, vendorbbmc, vendo...
11 [scamming, scammer, scam, scammers, scammed, s...
12 [counterfeiting, passport, counterfeit, fakeid...
13 [dreammarket, nightmaremarket, market, dreams,...
14 [lsd, tab, tabs, shrooms, acid, blotter, blott...
15 [monero, coinbase, coin, coins, cryptocurrency...
16 [review, reviewing, reviews, reviewer, reviewe...
17 [pickledrick, heard, theoutfit, muttznutz, hou...
18 [market, markets, marketplace, marketing, nonm...
19 [crosspost, deposting, goingpostal, vendors, c...
20 [deposit, depositing, deposits, ticket, deposi...
21 [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ...
22 [mod, moderator, dispute, disputes, disputers,...
23 [cryptonia, cryptoniausers, cryptonians, crypt...
24 [wsm, wsms, vendorcp, machinerymint, wowza, pa...
25 [ketamine, ketamin, ketamineking, ketamineking...
26 [ticket, ticketmaster, ticketw, tickets, suppo...
27 [meth, methbusters, methamphetamine, crystal, ...
Representative_Docs
0 [dutchdrugz updates promo active till market p...
1 [sale girl scout cookie carts strains oz lb us...
2 [hey really could use help advice thanks, erro...
3 [colombian coke brazil ship world wide promoti...
4 [adderall mg ir adderall mg xanax super sale, ...
5 [informed delivery showing package, usa canada...
6 [three student arrested dark web drug traffick...
7 [empire anyone else, empire market back, empir...
8 [sale xtc pill mg mda us ca, uk mdma pill vend...
9 [carding amazon gift card, gift card prepaid d...
10 [nmm giving vendor runaround lying acting shad...
11 [market exit scam next, scam alert ukdrugdeale...
12 [buy counterfeit money real fake document, buy...
13 [dream market still, dream market, eleven drea...
14 [lsd blotter tab ug top quality, point one fre...
15 [looking best safe way buy large amount bitcoi...
16 [needing send sample bar trusted reviewer woul...
17 [anybody heard theoutfit, anybody heard pickle...
18 [market anyone else, market, currently working...
19 [envoy want crosspost, could vendor crosspost,...
20 [missing deposit double deposit please help, a...
21 [pgp public key, market pgp key, find pgp key]
22 [moderator dispute day, moderator please help ...
23 [cryptonia market, market king samsara crypton...
24 [wsm vendor, wsm back, wsm down]
25 [ketamine us, get ketamine, ketamine anyone]
26 [help support ticket please, help support tick...
27 [crystal meth uk, crystal meth, crystal meth v...
Topic 0:
[('vape', 0.4513024), ('shatter', 0.4508166), ('carts', 0.42475972), ('cartridge', 0.4150574), ('ounce', 0.38511506), ('marijuana', 0.3761327), ('cannabis', 0.37473193), ('edibles', 0.36946523), ('weed', 0.35874215), ('cart', 0.3494926)]
Topic 1:
[('login', 0.6874596), ('password', 0.58739483), ('logged', 0.44535103), ('error', 0.39473626), ('problem', 0.38404456), ('log', 0.3703017), ('account', 0.36962464), ('help', 0.36578366), ('trouble', 0.3579351), ('session', 0.34920555)]
Topic 2:
[('coca', 0.5442445), ('opium', 0.5241908), ('cocain', 0.48566723), ('cocacolacompany', 0.47682497), ('coke', 0.4701375), ('cocainehcl', 0.4403491), ('cocaine', 0.43470532), ('heroinfactory', 0.43406424), ('colombian', 0.40406665), ('cokemaster', 0.39702898)]
Topic 3:
[('xanaxlabs', 0.68098766), ('xanaxlife', 0.6694618), ('xanax', 0.64481914), ('xanaxusa', 0.5943617), ('xanaxring', 0.5927005), ('xanaxdepot', 0.5860753), ('xanaxdaddy', 0.57530177), ('xanaxblotters', 0.5676911), ('alprazolam', 0.5388765), ('xanaxinc', 0.5038374)]
Topic 4:
[('postal', 0.5783647), ('usps', 0.5671008), ('delivery', 0.552514), ('postage', 0.5435632), ('mail', 0.4794371), ('deliver', 0.46840727), ('package', 0.4595977), ('shipment', 0.4503156), ('shipping', 0.44325382), ('fedex', 0.44258836)]
Topic 5:
[('darkweb', 0.5460649), ('darknetlive', 0.47999817), ('darknetmarkets', 0.46108282), ('sentenced', 0.4581046), ('darknetmarketsnoobs', 0.4534067), ('darknet', 0.45285586), ('darkbay', 0.45059866), ('darkfail', 0.44140962), ('darkdotfail', 0.42702472), ('darknetaustralia', 0.42165762)]
Topic 6:
[('empire', 0.8657665), ('empiremarket', 0.8325376), ('empireteam', 0.7658358), ('empiredealer', 0.73584473), ('empires', 0.7089321), ('imperial', 0.59743464), ('imperialroyalty', 0.533589), ('market', 0.39446667), ('scammer', 0.3011508), ('nightmare', 0.29797795)]
Topic 7:
[('mdma', 0.57491755), ('mdmamaster', 0.55362886), ('pill', 0.54554516), ('ecstasydata', 0.54158187), ('mdmaus', 0.536477), ('mdacanada', 0.49906433), ('mda', 0.47733676), ('md', 0.47456974), ('ecstasy', 0.46981525), ('mg', 0.45221412)]
Topic 8:
[('giftcard', 0.68464833), ('card', 0.6067195), ('giftcards', 0.60337466), ('mastercard', 0.5686253), ('cards', 0.5325688), ('carding', 0.5214343), ('debit', 0.500812), ('carded', 0.49536285), ('carder', 0.48081687), ('cardable', 0.45047107)]
Topic 9:
[('vendor', 0.6717965), ('vendorpro', 0.64170885), ('vendors', 0.63945156), ('vendorbbmc', 0.6131782), ('vendorshop', 0.5619679), ('supplier', 0.4961744), ('shop', 0.43687624), ('inventory', 0.38063982), ('dealer', 0.37658587), ('trusted', 0.35675985)]
Topic 10:
[('scamming', 0.67339057), ('scammer', 0.64245546), ('scam', 0.6315777), ('scammers', 0.60618246), ('scammed', 0.5859374), ('scams', 0.5844768), ('exit', 0.38286078), ('ukdrugdealer', 0.37872887), ('warning', 0.35860184), ('confirmed', 0.3483911)]
Topic 11:
[('counterfeiting', 0.5351553), ('passport', 0.49532643), ('counterfeit', 0.48550797), ('fakeid', 0.46835682), ('forgery', 0.46821818), ('passports', 0.46553856), ('certificate', 0.46403533), ('fakeids', 0.36332572), ('licenses', 0.3491515), ('citizenship', 0.33687454)]
Topic 12:
[('dreammarket', 0.840524), ('nightmaremarket', 0.7301478), ('market', 0.679103), ('dreams', 0.5537206), ('nightmare', 0.54951864), ('dream', 0.52395815), ('dreaming', 0.51259714), ('nightmares', 0.5112673), ('dreamweaver', 0.4622426), ('deals', 0.4392535)]
Topic 13:
[('lsd', 0.6597349), ('tab', 0.4486916), ('tabs', 0.42244914), ('shrooms', 0.40983063), ('acid', 0.37709463), ('blotter', 0.3619333), ('blotters', 0.34030285), ('microdose', 0.31792137), ('dmt', 0.30784056), ('samspade', 0.306018)]
Topic 14:
[('monero', 0.66440576), ('coinbase', 0.6017641), ('coin', 0.58206344), ('coins', 0.55229485), ('cryptocurrency', 0.54781383), ('crypto', 0.5190888), ('bitcoin', 0.49815544), ('btc', 0.4951193), ('cryptocurrencies', 0.49073264), ('bitcoins', 0.48276216)]
Topic 15:
[('review', 0.7554549), ('reviewing', 0.70764035), ('reviews', 0.67082256), ('reviewer', 0.6707778), ('reviewed', 0.66799235), ('vendor', 0.3507808), ('post', 0.3232708), ('sample', 0.3039448), ('journal', 0.28708428), ('dankservices', 0.2783244)]
Topic 16:
[('pickledrick', 0.49188858), ('heard', 0.45528996), ('theoutfit', 0.4499943), ('muttznutz', 0.40856874), ('houseofdank', 0.38270152), ('purepharm', 0.3821613), ('thecandymanuk', 0.38004813), ('ndduk', 0.3797817), ('uzak', 0.37892848), ('turk', 0.37287065)]
Topic 17:
[('market', 0.9246511), ('markets', 0.82856095), ('marketplace', 0.66924006), ('marketing', 0.64059925), ('nonmarket', 0.63226146), ('undermarket', 0.5758176), ('traderoute', 0.5252505), ('farmersmarket', 0.51230544), ('demand', 0.48939776), ('trade', 0.4373095)]
Topic 18:
[('crosspost', 0.8023433), ('deposting', 0.54462177), ('goingpostal', 0.4369921), ('vendors', 0.3397432), ('courier', 0.31433263), ('tarred', 0.30136013), ('expose', 0.28236645), ('shop', 0.26232204), ('buyers', 0.25981808), ('weareamsterdam', 0.25617945)]
Topic 19:
[('deposit', 0.5940467), ('depositing', 0.54835135), ('deposits', 0.4703769), ('ticket', 0.4124618), ('deposited', 0.37039375), ('transaction', 0.32960162), ('btc', 0.29055083), ('fund', 0.28815228), ('unconfirmed', 0.28022093), ('twice', 0.27061075)]
Topic 20:
[('pgpkey', 0.78953433), ('pgp', 0.64266664), ('pgps', 0.60433674), ('pg', 0.57204497), ('pgc', 0.5202303), ('gnupg', 0.49523085), ('key', 0.4912796), ('gpg', 0.45877883), ('keys', 0.42667422), ('pgplogin', 0.40541986)]
Topic 21:
[('mod', 0.6461178), ('moderator', 0.6455801), ('dispute', 0.63188905), ('disputes', 0.53940743), ('disputers', 0.5393207), ('mods', 0.5271941), ('complaint', 0.47743487), ('modderator', 0.43813834), ('consensus', 0.3737623), ('handled', 0.37211758)]
Topic 22:
[('cryptonia', 0.82683897), ('cryptoniausers', 0.7519192), ('cryptonians', 0.7422215), ('cryptnonia', 0.6530852), ('cryptoni', 0.6209998), ('cryptoice', 0.5572725), ('market', 0.5073216), ('samasara', 0.42220467), ('samsera', 0.42188087), ('samsara', 0.3912958)]
Topic 23:
[('wsm', 0.8689953), ('wsms', 0.6338644), ('vendorcp', 0.41763154), ('machinerymint', 0.36969972), ('wowza', 0.36484522), ('paymwn', 0.32914096), ('maintenance', 0.31149185), ('greennz', 0.3085622), ('bionik', 0.30364022), ('bioniks', 0.30257553)]
Topic 24:
[('ketamine', 0.9532861), ('ketamin', 0.86957943), ('ketamineking', 0.8578399), ('ketaminekings', 0.8378519), ('ketaminehouse', 0.8028732), ('ketamax', 0.69982356), ('ketaconnect', 0.527894), ('tiletamine', 0.5001087), ('pyrimethamine', 0.48265585), ('pharmaceutical', 0.43739906)]
Topic 25:
[('ticket', 0.7282917), ('ticketmaster', 0.6860643), ('ticketw', 0.65911514), ('tickets', 0.62922376), ('support', 0.51385075), ('concert', 0.37351736), ('help', 0.29014573), ('assist', 0.28098187), ('fix', 0.27553594), ('outstanding', 0.27276954)]
Topic 26:
[('meth', 0.7546984), ('methbusters', 0.71206135), ('methamphetamine', 0.6617794), ('crystal', 0.6237694), ('methamph', 0.6163767), ('methoxetamine', 0.6146395), ('methadone', 0.58694017), ('dmethamphetamine', 0.5264992), ('methaqualone', 0.49982086), ('amphetamine', 0.49571955)]
Topic -1:
[('customer', 0.44219303), ('buy', 0.42263174), ('sale', 0.38992852), ('buyer', 0.38299185), ('service', 0.38183293), ('message', 0.37282392), ('update', 0.37055105), ('price', 0.37036857), ('paypal', 0.35097662), ('legit', 0.34381357)]
# Bare expression: when it is the last statement of a notebook cell, the
# per-topic summary table is rendered as the cell output.
topic_model.get_topic_info()
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 30941 | -1_customer_buy_sale_buyer | [customer, buy, sale, buyer, service, message,... | [dutchdrugz updates promo active till market p... |
| 1 | 0 | 5117 | 0_vape_shatter_carts_cartridge | [vape, shatter, carts, cartridge, ounce, marij... | [sale girl scout cookie carts strains oz lb us... |
| 2 | 1 | 2643 | 1_login_password_logged_error | [login, password, logged, error, problem, log,... | [hey really could use help advice thanks, erro... |
| 3 | 2 | 2579 | 2_coca_opium_cocain_cocacolacompany | [coca, opium, cocain, cocacolacompany, coke, c... | [colombian coke brazil ship world wide promoti... |
| 4 | 3 | 2124 | 3_xanaxlabs_xanaxlife_xanax_xanaxusa | [xanaxlabs, xanaxlife, xanax, xanaxusa, xanaxr... | [adderall mg ir adderall mg xanax super sale, ... |
| 5 | 4 | 1938 | 4_postal_usps_delivery_postage | [postal, usps, delivery, postage, mail, delive... | [informed delivery showing package, usa canada... |
| 6 | 5 | 1842 | 5_darkweb_darknetlive_darknetmarkets_sentenced | [darkweb, darknetlive, darknetmarkets, sentenc... | [three student arrested dark web drug traffick... |
| 7 | 6 | 1721 | 6_empire_empiremarket_empireteam_empiredealer | [empire, empiremarket, empireteam, empiredeale... | [empire anyone else, empire market back, empir... |
| 8 | 7 | 1631 | 7_mdma_mdmamaster_pill_ecstasydata | [mdma, mdmamaster, pill, ecstasydata, mdmaus, ... | [sale xtc pill mg mda us ca, uk mdma pill vend... |
| 9 | 8 | 1601 | 8_giftcard_card_giftcards_mastercard | [giftcard, card, giftcards, mastercard, cards,... | [carding amazon gift card, gift card prepaid d... |
| 10 | 9 | 1502 | 9_vendor_vendorpro_vendors_vendorbbmc | [vendor, vendorpro, vendors, vendorbbmc, vendo... | [nmm giving vendor runaround lying acting shad... |
| 11 | 10 | 1417 | 10_scamming_scammer_scam_scammers | [scamming, scammer, scam, scammers, scammed, s... | [market exit scam next, scam alert ukdrugdeale... |
| 12 | 11 | 1126 | 11_counterfeiting_passport_counterfeit_fakeid | [counterfeiting, passport, counterfeit, fakeid... | [buy counterfeit money real fake document, buy... |
| 13 | 12 | 1072 | 12_dreammarket_nightmaremarket_market_dreams | [dreammarket, nightmaremarket, market, dreams,... | [dream market still, dream market, eleven drea... |
| 14 | 13 | 979 | 13_lsd_tab_tabs_shrooms | [lsd, tab, tabs, shrooms, acid, blotter, blott... | [lsd blotter tab ug top quality, point one fre... |
| 15 | 14 | 739 | 14_monero_coinbase_coin_coins | [monero, coinbase, coin, coins, cryptocurrency... | [looking best safe way buy large amount bitcoi... |
| 16 | 15 | 676 | 15_review_reviewing_reviews_reviewer | [review, reviewing, reviews, reviewer, reviewe... | [needing send sample bar trusted reviewer woul... |
| 17 | 16 | 674 | 16_pickledrick_heard_theoutfit_muttznutz | [pickledrick, heard, theoutfit, muttznutz, hou... | [anybody heard theoutfit, anybody heard pickle... |
| 18 | 17 | 669 | 17_market_markets_marketplace_marketing | [market, markets, marketplace, marketing, nonm... | [market anyone else, market, currently working... |
| 19 | 18 | 626 | 18_crosspost_deposting_goingpostal_vendors | [crosspost, deposting, goingpostal, vendors, c... | [envoy want crosspost, could vendor crosspost,... |
| 20 | 19 | 603 | 19_deposit_depositing_deposits_ticket | [deposit, depositing, deposits, ticket, deposi... | [missing deposit double deposit please help, a... |
| 21 | 20 | 573 | 20_pgpkey_pgp_pgps_pg | [pgpkey, pgp, pgps, pg, pgc, gnupg, key, gpg, ... | [pgp public key, market pgp key, find pgp key] |
| 22 | 21 | 535 | 21_mod_moderator_dispute_disputes | [mod, moderator, dispute, disputes, disputers,... | [moderator dispute day, moderator please help ... |
| 23 | 22 | 450 | 22_cryptonia_cryptoniausers_cryptonians_cryptn... | [cryptonia, cryptoniausers, cryptonians, crypt... | [cryptonia market, market king samsara crypton... |
| 24 | 23 | 445 | 23_wsm_wsms_vendorcp_machinerymint | [wsm, wsms, vendorcp, machinerymint, wowza, pa... | [wsm vendor, wsm back, wsm down] |
| 25 | 24 | 443 | 24_ketamine_ketamin_ketamineking_ketaminekings | [ketamine, ketamin, ketamineking, ketamineking... | [ketamine us, get ketamine, ketamine anyone] |
| 26 | 25 | 434 | 25_ticket_ticketmaster_ticketw_tickets | [ticket, ticketmaster, ticketw, tickets, suppo... | [help support ticket please, help support tick... |
| 27 | 26 | 429 | 26_meth_methbusters_methamphetamine_crystal | [meth, methbusters, methamphetamine, crystal, ... | [crystal meth uk, crystal meth, crystal meth v... |
# Internal cluster validity of the topic assignment, measured in the reduced
# UMAP space. Outliers (topic -1) are not a cluster, so they are excluded
# from both scores.
#
# The fitted reducer already holds the embedding of its training data in
# `embedding_`; reading it avoids calling fit_transform again, which would
# repeat the UMAP fit and overwrite the state of the reducer stored inside
# topic_model.
umap_embeddings = topic_model.umap_model.embedding_
topic_ids = np.asarray(topics)
# Row positions of every document that was assigned to a real topic.
indices = np.flatnonzero(topic_ids != -1)
X = umap_embeddings[indices]
labels = topic_ids[indices]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.6434006690979004 Davies_bouldin_score: 0.4681034572960446
# Re-extract the topic keywords with 1- to 5-gram terms (English stop words
# removed) before plotting.
# NOTE(review): update_topics receives only a vectorizer here; per the BERTopic
# documentation the c-TF-IDF and representation models then presumably fall
# back to their defaults, so the [mmr, kw] fine-tuning set up at fit time would
# no longer be applied - confirm this is intended.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 5))
topic_model.update_topics(tc1.corpus, vectorizer_model=vectorizer_model)
# Interactive overviews of the topics: distance map, similarity heatmap and
# hierarchical clustering of topics.
topic_model.visualize_topics()
topic_model.visualize_heatmap()
topic_model.visualize_hierarchy()
# 2-D projection of the sentence embeddings, used only for the document
# scatter plots. random_state is pinned (same seed as the 10-D clustering
# reducer) so the layout is reproducible instead of changing on every run.
reduced_embeddings = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(tc1.corpus_embeddings)
# Hover text and annotations are disabled to keep the figure light.
topic_model.visualize_documents(
    tc1.corpus,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True,
    hide_annotations=True,
)
# Re-assign outlier documents (topic -1) to a topic based on embedding
# similarity; `threshold` is the minimum similarity required, so documents
# below it remain outliers.
new_topics = topic_model.reduce_outliers(
    tc1.corpus,
    topics,
    strategy="embeddings",
    embeddings=tc1.corpus_embeddings,
    threshold=0.5,
)
# Pass the vectorizer explicitly. Without it update_topics builds a fresh
# default CountVectorizer, which drops the English stop-word filter and lets
# words such as "anyone", "has" or "get" into the topic keywords.
topic_model.update_topics(
    tc1.corpus,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
)
# Bare expression: shows the refreshed per-topic table as the cell output.
topic_model.get_topic_info()
2024-06-27 14:34:02,549 - BERTopic - WARNING: Using a custom list of topic assignments may lead to errors if topic reduction techniques are used afterwards. Make sure that manually assigning topics is the last step in the pipeline.Note that topic embeddings will also be created through weightedc-TF-IDF embeddings instead of centroid embeddings.
| Topic | Count | Name | Representation | Representative_Docs | |
|---|---|---|---|---|---|
| 0 | -1 | 27323 | -1_anyone_vendor_order_review | [anyone, vendor, order, review, new, get, acco... | [dutchdrugz updates promo active till market p... |
| 1 | 0 | 5137 | 0_weed_cannabis_cart_review | [weed, cannabis, cart, review, thc, vendor, oz... | [sale girl scout cookie carts strains oz lb us... |
| 2 | 1 | 2700 | 1_help_login_need_account | [help, login, need, account, sub, back, passwo... | [hey really could use help advice thanks, erro... |
| 3 | 2 | 2601 | 2_cocaine_coke_heroin_drug | [cocaine, coke, heroin, drug, vendor, uk, best... | [colombian coke brazil ship world wide promoti... |
| 4 | 3 | 2270 | 3_xanax_mg_adderall_alprazolam | [xanax, mg, adderall, alprazolam, bar, diazepa... | [adderall mg ir adderall mg xanax super sale, ... |
| 5 | 4 | 2031 | 4_order_shipping_package_delivery | [order, shipping, package, delivery, shipped, ... | [informed delivery showing package, usa canada... |
| 6 | 5 | 1861 | 5_darknet_dark_tor_web | [darknet, dark, tor, web, onion, dark web, dar... | [three student arrested dark web drug traffick... |
| 7 | 6 | 1826 | 6_empire_empire market_empire empire_market | [empire, empire market, empire empire, market,... | [empire anyone else, empire market back, empir... |
| 8 | 7 | 1653 | 7_mdma_pill_mda_xtc | [mdma, pill, mda, xtc, mdma vendor, mg, usa, p... | [sale xtc pill mg mda us ca, uk mdma pill vend... |
| 9 | 8 | 1628 | 8_card_carding_cc_credit | [card, carding, cc, credit, cvv, credit card, ... | [carding amazon gift card, gift card prepaid d... |
| 10 | 9 | 3010 | 9_vendor_vendor vendor_inquiry_vendor inquiry | [vendor, vendor vendor, inquiry, vendor inquir... | [nmm giving vendor runaround lying acting shad... |
| 11 | 10 | 1741 | 10_scam_scammer_exit_scamming | [scam, scammer, exit, scamming, scammed, exit ... | [market exit scam next, scam alert ukdrugdeale... |
| 12 | 11 | 1147 | 11_counterfeit_id_fake_passport | [counterfeit, id, fake, passport, fake id, not... | [buy counterfeit money real fake document, buy... |
| 13 | 12 | 1202 | 12_dream_nightmare_dream market_market | [dream, nightmare, dream market, market, night... | [dream market still, dream market, eleven drea... |
| 14 | 13 | 1009 | 13_lsd_ug_tab_lsd vendor | [lsd, ug, tab, lsd vendor, acid, free, lsd tab... | [lsd blotter tab ug top quality, point one fre... |
| 15 | 14 | 854 | 14_monero_btc_bitcoin_coin | [monero, btc, bitcoin, coin, crypto, wallet, b... | [looking best safe way buy large amount bitcoi... |
| 16 | 15 | 926 | 15_review_vendor review_vendor_review vendor | [review, vendor review, vendor, review vendor,... | [needing send sample bar trusted reviewer woul... |
| 17 | 16 | 681 | 16_heard_anyone_anyone heard_happened | [heard, anyone, anyone heard, happened, has, h... | [anybody heard theoutfit, anybody heard pickle... |
| 18 | 17 | 989 | 17_market_market market_new market_new | [market, market market, new market, new, apoll... | [market anyone else, market, currently working... |
| 19 | 18 | 764 | 18_crosspost_review crosspost_crosspost vendor... | [crosspost, review crosspost, crosspost vendor... | [envoy want crosspost, could vendor crosspost,... |
| 20 | 19 | 671 | 19_deposit_deposited_ticket_address | [deposit, deposited, ticket, address, double, ... | [missing deposit double deposit please help, a... |
| 21 | 20 | 596 | 20_pgp_key_pgp key_public | [pgp, key, pgp key, public, public pgp, messag... | [pgp public key, market pgp key, find pgp key] |
| 22 | 21 | 551 | 21_dispute_dispute dispute_mod_moderator | [dispute, dispute dispute, mod, moderator, ple... | [moderator dispute day, moderator please help ... |
| 23 | 22 | 480 | 22_cryptonia_samsara_samsara market_cryptonia ... | [cryptonia, samsara, samsara market, cryptonia... | [cryptonia market, market king samsara crypton... |
| 24 | 23 | 485 | 23_wsm_wsm wsm_wsm vendor_vendor wsm | [wsm, wsm wsm, wsm vendor, vendor wsm, vendor,... | [wsm vendor, wsm back, wsm down] |
| 25 | 24 | 468 | 24_ketamine_ketamine vendor_mdma ketamine_keta... | [ketamine, ketamine vendor, mdma ketamine, ket... | [ketamine us, get ketamine, ketamine anyone] |
| 26 | 25 | 458 | 25_ticket_support ticket_support_please | [ticket, support ticket, support, please, mont... | [help support ticket please, help support tick... |
| 27 | 26 | 467 | 26_meth_crystal meth_crystal_meth vendor | [meth, crystal meth, crystal, meth vendor, met... | [crystal meth uk, crystal meth, crystal meth v... |
import os

# Folder that receives one CSV of topic labels per confidence threshold.
# It is created up front so a missing directory cannot fail the export after
# the slow classification has already run.
zero_shot_dir = 'ZeroShotClassificationResults/all-MiniLM-L6-v2_400'
os.makedirs(zero_shot_dir, exist_ok=True)

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Candidate labels: the `intent` column of the crime-intent CSV.
zero_shot_topics = pd.read_csv('../../../intent_crime.csv')['intent'].tolist()
# Number of real topics, i.e. without the outlier bucket -1.
n_real_topics = len(set(new_topics) - {-1})

# Label the topics once per threshold; the key is the file-name suffix.
zero_shot_by_suffix = {
    suffix: ppt.assign_labels_to_topics(
        classifier, topic_model, zero_shot_topics, n_real_topics, threshold=threshold
    )
    for suffix, threshold in (('025', .25), ('020', .2), ('017', .17), ('015', .15))
}
# Keep the original variable names: later cells use them directly.
dict_zero_shots_25 = zero_shot_by_suffix['025']
dict_zero_shots_2 = zero_shot_by_suffix['020']
dict_zero_shots_17 = zero_shot_by_suffix['017']
dict_zero_shots_15 = zero_shot_by_suffix['015']

# Export each mapping as a two-column (Topic, Labels) table.
for suffix, labels_by_topic in zero_shot_by_suffix.items():
    pd.DataFrame(list(labels_by_topic.items()), columns=['Topic', 'Labels']).to_csv(
        f'{zero_shot_dir}/zero_shot_{suffix}.csv', index=False
    )
# Hand-written labels that replace the zero-shot result for three topics,
# then register the whole mapping as the model's custom topic labels.
for topic_id, manual_label in (
    (18, 'crosspost vendor'),
    (22, 'samsara market'),
    (23, 'wsm market'),
):
    dict_zero_shots_2[topic_id] = manual_label
topic_model.set_topic_labels(dict_zero_shots_2)
# Same plots as before, now rendered with the custom topic labels.
topic_model.visualize_documents(
    tc1.corpus,
    reduced_embeddings=reduced_embeddings,
    hide_document_hover=True,
    hide_annotations=True,
    custom_labels=True,
)
topic_model.visualize_hierarchy(custom_labels=True)
# custom_labels was missing only on this call; it is added so the distance map
# is labelled consistently with the other figures in this group.
topic_model.visualize_topics(custom_labels=True)
topic_model.visualize_barchart(top_n_topics=25, custom_labels=True, n_words=10)
# Cluster validity after outlier reduction, measured in the reduced UMAP
# space. Documents still labelled -1 are excluded from both scores.
#
# The fitted reducer already holds the embedding of its training data in
# `embedding_`; reading it avoids calling fit_transform again, which would
# repeat the UMAP fit and overwrite the state of the reducer stored inside
# topic_model.
umap_embeddings = topic_model.umap_model.embedding_
topic_ids = np.asarray(new_topics)
# Row positions of every document that now belongs to a real topic.
indices = np.flatnonzero(topic_ids != -1)
# X is kept as a global: the results table built later stores it per document.
X = umap_embeddings[indices]
labels = topic_ids[indices]
silhouette_scores = silhouette_score(X, labels)
print(f"silhouette_score: {silhouette_scores}")
print(f"Davies_bouldin_score: {davies_bouldin_score(X, labels)}")
silhouette_score: 0.5175204277038574 Davies_bouldin_score: 0.7919422601150089
# Normalise the thread titles and keep one row per distinct, non-missing
# title, then collect the matching timestamps.
# The former `.dropna()` on the right-hand side was a no-op: assigning the
# shortened Series back to the column re-aligns on the index and restores the
# NaN rows. Missing titles are removed by the explicit dropna below.
# NOTE(review): assumes ppt.TextClustering filtered df the same way, so that
# created_on lines up one-to-one with tc1.corpus - confirm.
df['name_thread'] = df['name_thread'].str.lower()
df.dropna(subset=['name_thread'], inplace=True)
df.drop_duplicates(subset='name_thread', inplace=True)
created_on = df['created_on'].tolist()
# Bare expression: displays the number of timestamps as the cell output.
len(created_on)
# Topic representations per time bin (timestamps grouped into 100 bins), with
# both global and evolutionary tuning enabled.
topics_over_time = topic_model.topics_over_time(
    tc1.corpus,
    created_on,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=100,
)
# Plot the ten largest topics over time using the custom labels.
topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,
    width=1250,
    height=700,
    custom_labels=True,
)
15it [00:24, 1.62s/it]
# Positions of the documents that ended up in a real topic (not -1).
indices = [position for position, topic in enumerate(new_topics) if topic != -1]

# Select those positions from every per-document sequence in one pass.
corpus_valid, created_on_valid, embeddings_valid, topics_valid, probs_valid = (
    [sequence[position] for position in indices]
    for sequence in (tc1.corpus, created_on, tc1.corpus_embeddings, new_topics, probs)
)

# One row per kept document.
results = pd.DataFrame(dict(
    Document=corpus_valid,
    Embedding=embeddings_valid,
    Topic=topics_valid,
    Probability=probs_valid,
    Created_on=created_on_valid,
))
# Attach the per-topic metadata (count, name, labels, keywords, ...).
results_final = results.merge(topic_model.get_topic_info(), on='Topic')
# X holds the reduced UMAP vectors of the same non-outlier documents, computed
# in the evaluation step that follows outlier reduction; it is assigned row by
# row in position order.
results_final['UMAP_embedding'] = list(X)
print(results_final.shape)
results_final.head()
(38274, 11)
| Document | Embedding | Topic | Probability | Created_on | Count | Name | CustomName | Representation | Representative_Docs | UMAP_embedding | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | review empire vendor acidbern | [-0.07762138, -0.049061198, -0.046745114, -0.0... | 6 | 0.527385 | 2020-01-09 | 1826 | 6_empire_empire market_empire empire_market | empire market | [empire, empire market, empire empire, market,... | [empire anyone else, empire market back, empir... | [9.086779, 3.6718397, 8.9006195, -1.1745992, 1... |
| 1 | vendor shipping combine priority | [-0.027722627, -0.0031221025, 0.01195772, -0.0... | 4 | 0.962274 | 2019-11-06 | 2031 | 4_order_shipping_package_delivery | order | [order, shipping, package, delivery, shipped, ... | [informed delivery showing package, usa canada... | [9.679236, 2.7164314, 8.733615, 0.011899776, 8... |
| 2 | open ticket since may ticket | [0.055031013, -0.018210536, -0.0026789573, -0.... | 25 | 1.000000 | 2020-01-09 | 458 | 25_ticket_support ticket_support_please | ticket support - ask help | [ticket, support ticket, support, please, mont... | [help support ticket please, help support tick... | [9.901975, 5.2703958, 11.463735, 0.47217792, 8... |
| 3 | vendor inquiry destroid dream | [-0.023196185, 0.0573189, 0.028408512, -0.0222... | 9 | 0.000000 | 2019-11-06 | 3010 | 9_vendor_vendor vendor_inquiry_vendor inquiry | inquiry - vendor vendor - vendor | [vendor, vendor vendor, inquiry, vendor inquir... | [nmm giving vendor runaround lying acting shad... | [9.912251, 4.028657, 7.623224, -0.7158077, 9.2... |
| 4 | morrison saver stamps uk money maker easiest m... | [-0.020903945, 0.050762244, -0.041445963, 0.01... | 11 | 0.799023 | 2020-01-09 | 1147 | 11_counterfeit_id_fake_passport | counterfeit money - fake IDs | [counterfeit, id, fake, passport, fake id, not... | [buy counterfeit money real fake document, buy... | [9.859931, 3.1459394, 9.145497, -1.0489817, 9.... |
import os

# Create both output folders first so the writes below cannot fail on a
# missing directory.
os.makedirs("Models", exist_ok=True)
os.makedirs("ResultsBERTopic", exist_ok=True)

# With pickle serialization `save_embedding_model` is an on/off switch, so it
# is given a bool (the model object passed before only worked because it is
# truthy). The embedding model is still stored with the pickled topic model.
# NOTE: pickle files must only be loaded from trusted sources.
topic_model.save(
    "Models/topic_model_all-MiniLM-L6-v2_400",
    serialization="pickle",
    save_ctfidf=True,
    save_embedding_model=True,
)
# Per-document results (text, embeddings, topic, metadata) for downstream use.
results_final.to_parquet('ResultsBERTopic/BERTopic_all-MiniLM-L6-v2_400.parquet')
# The imported name is not referenced below; presumably the import only
# verifies that the nbconvert package is installed - confirm.
import nbconvert
# IPython shell escape (valid only inside a notebook/IPython session):
# converts show_results.ipynb to an HTML file.
!jupyter nbconvert --to html show_results.ipynb